########################################################
## This code carries out parameter tuning, model training and prediction
## for the model that excludes recalled employees.
########################################################

########################################################
## Preparation of the workspace
########################################################

## start from an empty workspace: drop every object that ls() lists
## (loaded packages and options are not affected)
rm(list = ls())

## load the required packages
library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)
library(VIM)
library(base)
library(ranger)
library(glmnet)
library(xgboost)

## record when the run starts, so the run time can be checked afterwards
start_time <- Sys.time()

## project root and the two sub-directories used below
main <- "placeholder_main"

dataDirectory <- file.path(main, "Data")
RDirectory <- file.path(main, "Programs", "Output Generation")

## bring in the tuning and training functions (tuning.int / training.int
## are called further down and are expected to come from these files)
source(file.path(RDirectory, "102_0_caret_parameter_tuning_Function.R"))
source(file.path(RDirectory, "103_0_caret_predictions_Function.R"))



########################################################
## Tune, train and predict for the emplAft6M_0M_In outcome
########################################################
 
## Full pipeline for the first outcome: hyperparameters are tuned on
## samp == 1, the models are trained on samp == 2 (both restricted to
## recalled == 0), and predictions are written for samp 3, 4 and 5.

# Define the outcome variable that is predicted
dependent <- c("emplAft6M_0M_In")
seed <- 2111

## Load the data set:
data <- read_dta(paste0(dataDirectory, "/002_DataForR_Full_NoRecalls_2006.dta"))

## Data cleaning:
y <- dependent

# Correcting format of outcome variable:
data[[y]] <- factor(data[[y]])
levels(data[[y]]) <- c("no", "yes")

# Keeping only those observations with non-missing outcome variables
data <- data[complete.cases(data[, y]), ]

## Old way to select variables:
first_column <- which(colnames(data) == "Gender") ## Column number where the variables of interest start
y_column <- which(colnames(data) == dependent) ## Column number with the dependent variable of interest

## Outcome first, then every column from "Gender" to the end.
## setdiff() keeps the outcome from being selected twice (and so ending up
## among the covariates) should it sit at or after "Gender".
## NOTE(review): any other outcome column located after "Gender" would still
## be kept as a covariate -- confirm against the column order of the .dta file.
data.in <- data[, c(y_column, setdiff(first_column:ncol(data), y_column))]

## which() discards rows whose samp/recalled is NA; a bare logical index
## would turn each such NA into an all-NA row of the tuning/training sample.
data_tune <- data.in[which(data$samp == 1 & data$recalled == 0), ]
data_train <- data.in[which(data$samp == 2 & data$recalled == 0), ]
data_pred <- data.in[data$samp %in% c(3, 4, 5), ]
persinfo <- data[data$samp %in% c(3, 4, 5), c("LopNr_PersonNr", "InLnr", "n", "samp", "recalled")]


## Run the tuning function to get the hyperparameters:
cat("--> Tuning")
parameters <- tuning.int(data = data_tune,
                         dependent = y,
                         seed = seed, noisily = TRUE)

## The three grids are stored on disk so that later runs can reuse them
write.csv(parameters$rfgrid_final, file = paste0(dataDirectory,"/102_rfgrid_Full_NoRecalls_", dependent, "_2006.csv"), row.names = FALSE)
write.csv(parameters$boostgrid_final, file = paste0(dataDirectory,"/102_boostgrid_Full_NoRecalls_", dependent, "_2006.csv"), row.names = FALSE)
write.csv(parameters$lassogrid_final, file = paste0(dataDirectory,"/102_lassogrid_Full_NoRecalls_", dependent, "_2006.csv"), row.names = FALSE)


## Run the training function to train the three algorithms:
cat("--> Training")
models <- training.int(data = data_train,
                       dependent = y,
                       parameters = parameters,
                       seed = seed, noisily = TRUE)


## Save models
save(models,
     file = paste0(dataDirectory,"/103_Models_Full_NoRecalls_", dependent, "_2006.rda"))


## Finally, get the predictions:
cat("--> Predicting")

# Split data into dependent and independent variables:
total_y <- data_pred[[dependent]]
total_x <- data_pred[, setdiff(names(data_pred), dependent)]

## Create predictions from the models

## Creating predictions Random Forest
## (second column of the prediction matrix is kept; presumably the "yes"
## class, given the level order set above -- confirm in training.int)
pred_rf <- predict(models$rff_final, total_x)
pred_rf <- pred_rf[["predictions"]]
pred_rf <- data.frame(pred_rf[, 2])

## Creating predictions Gradient Boost
pred_boost <- predict(models$rboost_final, data.matrix(total_x))

## Creating predictions LASSO
pred_lasso <- predict(models$rlasso_final, data.matrix(total_x), type = "response")

# Put the observed outcome and the three predictions side by side:
predictions <- data.frame(total_y, pred_rf, pred_boost, pred_lasso)

# Rename predictions:
colnames(predictions) <- c(y,
                           paste0("p_", y, "_rf"),
                           paste0("p_", y, "_boost"),
                           paste0("p_", y, "_lasso"))

# Merge with personal information:
output <- cbind(persinfo, predictions)

## Save predictions
write.csv(output,
          file = paste0(dataDirectory,"/103_predictionsR_Full_NoRecalls_", dependent, "_2006.csv"))


########################################################
## Train and predict for the 6M and 12M outcomes, reusing the 0M hyperparameters
########################################################

## No tuning is done for these outcomes: the hyperparameter grids saved for
## the outcome named below are read back from disk and reused.
tuned_outcome <- "emplAft6M_0M_In"

parameters <- list()
parameters$rfgrid_final <- read.csv(file = paste0(dataDirectory,"/102_rfgrid_Full_NoRecalls_", tuned_outcome, "_2006.csv"))
parameters$boostgrid_final <- read.csv(file = paste0(dataDirectory,"/102_boostgrid_Full_NoRecalls_", tuned_outcome, "_2006.csv"))
parameters$lassogrid_final <- read.csv(file = paste0(dataDirectory,"/102_lassogrid_Full_NoRecalls_", tuned_outcome, "_2006.csv"))

## The input file is the same for every outcome, so it is read only once;
## each iteration below starts again from this untouched copy.
raw_data <- read_dta(paste0(dataDirectory, "/002_DataForR_Full_NoRecalls_2006.dta"))

# Define the outcome variables that are predicted
for (dependent in c("emplAft6M_6M_In", "emplAft6M_12M_In")) {
  seed <- 2111

  ## Fresh copy of the data set:
  data <- raw_data

  ## Data cleaning:
  y <- dependent

  # Correcting format of outcome variable:
  data[[y]] <- factor(data[[y]])
  levels(data[[y]]) <- c("no", "yes")

  # Keeping only those observations with non-missing outcome variables
  data <- data[complete.cases(data[, y]), ]

  ## Old way to select variables:
  first_column <- which(colnames(data) == "Gender") ## Column number where the variables of interest start
  y_column <- which(colnames(data) == dependent) ## Column number with the dependent variable of interest

  ## Outcome first, then every column from "Gender" to the end; setdiff()
  ## keeps the outcome from being selected twice if it sits in that range.
  data.in <- data[, c(y_column, setdiff(first_column:ncol(data), y_column))]

  ## Training uses samp 1 and 2 together here (no separate tuning sample).
  ## which() discards rows whose samp/recalled is NA; a bare logical index
  ## would turn each such NA into an all-NA training row.
  data_train <- data.in[which(data$samp %in% c(1, 2) & data$recalled == 0), ]
  data_pred <- data.in[data$samp %in% c(3, 4, 5), ]
  persinfo <- data[data$samp %in% c(3, 4, 5), c("LopNr_PersonNr", "InLnr", "n", "samp", "recalled")]

  ## Run the training function to train the three algorithms:
  cat("--> Training")
  models <- training.int(data = data_train,
                         dependent = y,
                         parameters = parameters,
                         seed = seed, noisily = TRUE)

  ## Save models
  save(models,
       file = paste0(dataDirectory,"/103_Models_Full_NoRecalls_", dependent, "_2006.rda"))


  ## Finally, get the predictions:
  cat("--> Predicting")

  # Split data into dependent and independent variables:
  total_y <- data_pred[[dependent]]
  total_x <- data_pred[, setdiff(names(data_pred), dependent)]

  ## Create predictions from the models

  ## Creating predictions Random Forest (second column of the prediction
  ## matrix is kept)
  pred_rf <- predict(models$rff_final, total_x)
  pred_rf <- pred_rf[["predictions"]]
  pred_rf <- data.frame(pred_rf[, 2])

  ## Creating predictions Gradient Boost
  pred_boost <- predict(models$rboost_final, data.matrix(total_x))

  ## Creating predictions LASSO
  pred_lasso <- predict(models$rlasso_final, data.matrix(total_x), type = "response")

  # Put the observed outcome and the three predictions side by side:
  predictions <- data.frame(total_y, pred_rf, pred_boost, pred_lasso)

  # Rename predictions:
  colnames(predictions) <- c(y,
                             paste0("p_", y, "_rf"),
                             paste0("p_", y, "_boost"),
                             paste0("p_", y, "_lasso"))

  # Merge with personal information:
  output <- cbind(persinfo, predictions)

  ## Save predictions
  write.csv(output,
            file = paste0(dataDirectory,"/103_predictionsR_Full_NoRecalls_", dependent, "_2006.csv"))
}

## the untouched copy is no longer needed
rm(raw_data)



##################################################
## Creating predictions for people unemployed in X month using Y month model
########################################################   

## For every outcome, score the prediction sample (samp 3, 4, 5) with the
## models that were trained on each of the OTHER outcomes and write one CSV
## per (outcome, model) pair.

## The input file is the same for every outcome, so it is read only once;
## each iteration below starts again from this untouched copy.
raw_data <- read_dta(paste0(dataDirectory, "/002_DataForR_Full_NoRecalls_2006.dta"))

for (dependent in c("emplAft6M_0M_In", "emplAft6M_6M_In", "emplAft6M_12M_In")) {
  seed <- 2111

  ## Fresh copy of the data set:
  data <- raw_data

  ## Data cleaning:
  y <- dependent

  # Correcting format of outcome variable:
  data[[y]] <- factor(data[[y]])
  levels(data[[y]]) <- c("no", "yes")

  # Keeping only those observations with non-missing outcome variables
  data <- data[complete.cases(data[, y]), ]

  ## Old way to select variables:
  first_column <- which(colnames(data) == "Gender") ## Column number where the variables of interest start
  y_column <- which(colnames(data) == dependent) ## Column number with the dependent variable of interest

  ## Outcome first, then every column from "Gender" to the end; setdiff()
  ## keeps the outcome from being selected twice if it sits in that range.
  data.in <- data[, c(y_column, setdiff(first_column:ncol(data), y_column))]

  data_pred <- data.in[data$samp %in% c(3, 4, 5), ]
  persinfo <- data[data$samp %in% c(3, 4, 5), c("LopNr_PersonNr", "InLnr", "n", "samp", "recalled")]

  ## Finally, get the predictions:
  cat("--> Predicting")

  # Split data into dependent and independent variables:
  total_y <- data_pred[[dependent]]
  total_x <- data_pred[, setdiff(names(data_pred), dependent)]

  for (unempl in c(0, 6, 12)) {

    ## name of the outcome whose saved models are applied in this pass
    model_outcome <- paste0("emplAft6M_", unempl, "M_In")

    ## the outcome's own models are skipped; only cross-model pairs are scored
    if (dependent != model_outcome) {

      ########################################################
      ## Load models
      ########################################################

      ## restores the object(s) stored in the .rda file into the workspace;
      ## the code below relies on it providing `models`
      load(file = paste0(dataDirectory,"/103_Models_Full_NoRecalls_", model_outcome, "_2006.rda"))

      ########################################################
      ## Create predictions
      ########################################################

      ## Creating predictions Random Forest (second column of the prediction
      ## matrix is kept)
      pred_rf <- predict(models$rff_final, total_x)
      pred_rf <- pred_rf[["predictions"]]
      pred_rf <- data.frame(pred_rf[, 2])

      ## Creating predictions Gradient Boost
      pred_boost <- predict(models$rboost_final, data.matrix(total_x))

      ## Creating predictions LASSO
      pred_lasso <- predict(models$rlasso_final, data.matrix(total_x), type = "response")

      # Put the observed outcome and the three predictions side by side:
      predictions <- data.frame(total_y, pred_rf, pred_boost, pred_lasso)

      # Rename predictions (columns carry the name of the outcome being
      # scored; the model that produced them is identified by the file name):
      colnames(predictions) <- c(y,
                                 paste0("p_", y, "_rf"),
                                 paste0("p_", y, "_boost"),
                                 paste0("p_", y, "_lasso"))

      # Merge with personal information:
      output <- cbind(persinfo, predictions)

      ########################################################
      ## Save predictions
      ########################################################
      write.csv(output, file = paste0(dataDirectory,"/103_predictionsR_Full_NoRecalls_", dependent, "_2006_", model_outcome, "_Model.csv"))
      rm("output") ## remove the dataset with output

      print(paste(dependent,"End")) ## report that this outcome/model pair is finished

    }

  }
}

## the untouched copy is no longer needed
rm(raw_data)

